.Rmd file as a template. (a) Write a function named `diagnostics` that takes as input the arguments:
`model`, an object of class `lm`, that is, a model fit via `lm()`; `pcol`, for controlling point colors in plots, with a default value of grey; `lcol`, for controlling line colors in plots, with a default value of dodgerblue; `alpha`, the significance level of any test performed inside the function, with a default value of 0.05; `plotit`, a logical value for controlling display of plots, with default value TRUE; `testit`, a logical value for controlling output of test results, with default value TRUE. The function should output:
When `testit` is TRUE:
`p_val`, the p-value of the Shapiro-Wilk test for assessing normality, and `decision`, the decision made when performing the Shapiro-Wilk test using the `alpha` value input to the function — "Reject" if the null hypothesis is rejected, otherwise "Fail to Reject." When `plotit` is TRUE:
A fitted-versus-residuals plot and a Normal Q-Q plot with `qqnorm()` and `qqline()`. The points and line should be colored according to the input arguments. Be sure each plot has a title. Consider using this function to help with the remainder of the assignment as well.
#Function created as part of the (a)
# (a) Diagnostic helper for a fitted lm model.
#
# Arguments:
#   model  - an object of class lm.
#            NOTE(review): the default `fit_1` is resolved lazily against the
#            caller's environment and only works if a global `fit_1` exists
#            at call time -- confirm this default is intended.
#   pcol   - point color used in both plots
#   lcol   - line color used in both plots
#   alpha  - significance level for the Shapiro-Wilk test
#   plotit - if TRUE, draw a fitted-vs-residuals plot and a Normal Q-Q plot
#   testit - if TRUE, return list(p_val, decision) from the Shapiro-Wilk
#            test on the residuals; otherwise return invisible(NULL)
diagnostics <- function(model = fit_1, pcol = "grey", lcol = "dodgerblue",
                        alpha = 0.05, plotit = TRUE, testit = TRUE) {
  if (plotit) {
    # Side-by-side layout; restore the caller's graphics settings on exit
    # (the original permanently changed par(mfrow) for the session).
    old_par <- par(mfrow = c(1, 2))
    on.exit(par(old_par), add = TRUE)
    # Fitted versus residuals: checks linearity and constant variance
    plot(fitted(model), resid(model), col = pcol, pch = 1, cex = 1,
         xlab = "Fitted", ylab = "Residuals", main = "Fitted Versus Residual")
    abline(h = 0, col = lcol, lwd = 1, lty = 1)
    # Normal Q-Q plot: checks the normality assumption
    qqnorm(resid(model), col = pcol, pch = 1, cex = 1,
           main = "Q-Q Plot of the Model")
    qqline(resid(model), col = lcol, lwd = 1, lty = 1)
  }
  if (testit) {
    # Only compute the test when its result is actually requested
    p_val <- shapiro.test(resid(model))$p.value
    decision <- if (p_val > alpha) "Fail to Reject" else "Reject"
    list(p_val = p_val, decision = decision)
  } else {
    invisible(NULL)
  }
}
(b) Run the following code.
# (b) Three simulated SLR datasets, each violating (or satisfying) a
# different model assumption. RNG call order matches the seed exactly.
set.seed(420)
# fit_1: exponential errors -- normality is violated
data_1 <- data.frame(x = runif(n = 30, min = 0, max = 10),
                     y = numeric(30))
data_1$y <- with(data_1, 2 + 1 * x + rexp(n = 30))
fit_1 <- lm(y ~ x, data = data_1)
# fit_2: standard normal errors -- no assumption violated
data_2 <- data.frame(x = runif(n = 20, min = 0, max = 10),
                     y = numeric(20))
data_2$y <- with(data_2, 5 + 2 * x + rnorm(n = 20))
fit_2 <- lm(y ~ x, data = data_2)
# fit_3: error sd grows with x -- constant variance is violated
data_3 <- data.frame(x = runif(n = 40, min = 0, max = 10),
                     y = numeric(40))
data_3$y <- with(data_3, 2 + 1 * x + rnorm(n = 40, sd = x))
fit_3 <- lm(y ~ x, data = data_3)
diagnostics(fit_1, plotit = FALSE)$p_val
## [1] 0.0004299
diagnostics(fit_2, plotit = FALSE)$decision
## [1] "Fail to Reject"
diagnostics(fit_1, testit = FALSE, pcol = "black", lcol = "black")
diagnostics(fit_2, testit = FALSE, pcol = "grey", lcol = "green")
diagnostics(fit_3)
## $p_val
## [1] 0.09105
##
## $decision
## [1] "Fail to Reject"
For this exercise, we will use the prostate data, which can be found in the faraway package. After loading the faraway package, use ?prostate to learn about this dataset.
# Load the faraway package and take a local copy of the prostate data
library(faraway)
prostate_data <- faraway::prostate
(a) Fit an additive multiple regression model with lpsa as the response and the remaining variables in the prostate dataset as predictors. Report the \(R^2\) value for this model.
# (a) Additive MLR: lpsa regressed on every other column of the data
prostate_add_model <- lm(lpsa ~ ., data = prostate_data)
summary(prostate_add_model)
##
## Call:
## lm(formula = lpsa ~ ., data = prostate_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.733 -0.371 -0.017 0.414 1.638
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.66934 1.29639 0.52 0.6069
## lcavol 0.58702 0.08792 6.68 2.1e-09 ***
## lweight 0.45447 0.17001 2.67 0.0090 **
## age -0.01964 0.01117 -1.76 0.0823 .
## lbph 0.10705 0.05845 1.83 0.0704 .
## svi 0.76616 0.24431 3.14 0.0023 **
## lcp -0.10547 0.09101 -1.16 0.2496
## gleason 0.04514 0.15746 0.29 0.7750
## pgg45 0.00453 0.00442 1.02 0.3089
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.708 on 88 degrees of freedom
## Multiple R-squared: 0.655, Adjusted R-squared: 0.623
## F-statistic: 20.9 on 8 and 88 DF, p-value: <2e-16
\(R^2\) value for this model is : 0.6548
(b) Check the constant variance assumption for this model. Do you feel it has been violated? Justify your answer.
# (b) Fitted vs residuals for the additive model (constant-variance check)
plot(fitted(prostate_add_model), resid(prostate_add_model),
     col = "darkgrey", pch = 1, cex = 1,
     xlab = "Fitted", ylab = "Residuals",
     main = "Fitted Vs Residual Plot for Additive Model")
abline(h = 0, col = "darkorange", lwd = 1, lty = 1)
From the Fitted versus Residuals plot of the additive prostate model above, the spread of the residuals looks roughly constant across the fitted values, so the constant variance assumption does not appear to be violated. Let's confirm with the Breusch-Pagan test (bptest) below.
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.5.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(prostate_add_model)
##
## studentized Breusch-Pagan test
##
## data: prostate_add_model
## BP = 10, df = 8, p-value = 0.3
The bptest also returns a p-value, `r bptest(prostate_add_model)$p.value`, which is large (greater than 0.05), so the model does not violate the constant variance assumption.
(c) Check the normality assumption for this model. Do you feel it has been violated? Justify your answer.
# (c) Normal Q-Q plot of the residuals (normality check)
qqnorm(resid(prostate_add_model), col = "darkgrey", pch = 1, cex = 1,
       main = "Q-Q Plot for the Additive Prostate Model")
qqline(resid(prostate_add_model), col = "darkorange", lwd = 1, lty = 1)
The Q-Q plot shows the points following the straight line closely, deviating only at the upper and lower tails. Since the bulk of the points follow the line, the model does not appear to violate the normality assumption.
(d) Check for any high leverage observations. Report any observations you determine to have high leverage.
(e) Check for any influential observations. Report any observations you determine to be influential.
(f) Refit the additive multiple regression model without any points you identified as influential. Compare the coefficients of this fitted model to the previously fitted model.
(g) Create a data frame that stores the observations that were “removed” because they were influential. Use the two models you have fit to make predictions with these observations. Comment on the difference between these two sets of predictions.
Why do we care about violations of assumptions? One key reason is that the distributions of the parameter estimators that we have used all rely on these assumptions. When the assumptions are violated, the distributional results are not correct, so our tests are garbage. Garbage In, Garbage Out!
Consider the following setup that we will use for the remainder of the exercise. We choose a sample size of 50.
# Simulation setup: n observations with two fixed predictors
n <- 50
set.seed(420)
x_1 <- runif(n, 0, 5)   # uniform on [0, 5]
x_2 <- runif(n, -2, 2)  # uniform on [-2, 2]
Consider the model,
\[ Y = 4 + 1 x_1 + 0 x_2 + \epsilon. \]
That is,
We now simulate y_1 in a manner that does not violate any assumptions, which we will verify. In this case \(\epsilon \sim N(0, 1).\)
# y_1: N(0, 1) errors -- no assumption violated; BP test should not reject
library(lmtest)
set.seed(1)
y_1 <- 4 + 1 * x_1 + 0 * x_2 + rnorm(n = n, mean = 0, sd = 1)
fit_1 <- lm(y_1 ~ x_1 + x_2)
bptest(fit_1)
##
## studentized Breusch-Pagan test
##
## data: fit_1
## BP = 4.4, df = 2, p-value = 0.1
Then, we simulate y_2 in a manner that does violate assumptions, which we again verify. In this case \(\epsilon \sim N(0, \sigma = |x_2|).\)
# y_2: error sd = |x_2| -- constant variance violated; BP test should reject
set.seed(1)
y_2 <- 4 + 1 * x_1 + 0 * x_2 + rnorm(n = n, mean = 0, sd = abs(x_2))
fit_2 <- lm(y_2 ~ x_1 + x_2)
bptest(fit_2)
##
## studentized Breusch-Pagan test
##
## data: fit_2
## BP = 8.4, df = 2, p-value = 0.01
(a) Use the following code after changing birthday to your birthday.
# (a) Simulate both responses num_sims times; each iteration stores the
# p-value of the t-test for beta_2 = 0 (coefficient row 3 = x_2,
# column 4 = Pr(>|t|)) for both fitted models.
num_sims <- 2500
p_val_1 <- rep(0, num_sims)
p_val_2 <- rep(0, num_sims)
birthday <- 19770411
set.seed(birthday)
# Use seq_len(num_sims) rather than a hard-coded 1:2500 so the loop always
# agrees with num_sims; use the full element name `coefficients` rather than
# relying on partial matching of `$coefficient`.
for (i in seq_len(num_sims)) {
  # Model 1: homoscedastic N(0, 1) errors (assumptions hold)
  y_1 <- 4 + 1 * x_1 + 0 * x_2 + rnorm(n = n, mean = 0, sd = 1)
  fit_1 <- lm(y_1 ~ x_1 + x_2)
  p_val_1[i] <- summary(fit_1)$coefficients[3, 4]
  # Model 2: error sd = |x_2| (constant-variance assumption violated)
  y_2 <- 4 + 1 * x_1 + 0 * x_2 + rnorm(n = n, mean = 0, sd = abs(x_2))
  fit_2 <- lm(y_2 ~ x_1 + x_2)
  p_val_2[i] <- summary(fit_2)$coefficients[3, 4]
}
Repeat the above process of generating y_1 and y_2 as defined above, and fit models with each as the response 2500 times. Each time, store the p-value for testing,
\[ \beta_2 = 0, \]
using both models, in the appropriate variables defined above. (You do not need to use a data frame as we have in the past. Although, feel free to modify the code to instead use a data frame.)
(b) What proportion of the p_val_1 values is less than 0.01? Less than 0.05? Less than 0.10? What proportion of the p_val_2 values is less than 0.01? Less than 0.05? Less than 0.10? Arrange your results in a table. Briefly explain these results.
# (b) Table of the empirical rejection proportions at three alpha levels
library(knitr)
Proportion_report <- data.frame(
  p_value = rep(c("p_value_1", "p_value_2"), each = 3),
  alpha_value = rep(c("0.01", "0.05", "0.1"), times = 2),
  Proportion = c(mean(p_val_1 < 0.01), mean(p_val_1 < 0.05),
                 mean(p_val_1 < 0.10), mean(p_val_2 < 0.01),
                 mean(p_val_2 < 0.05), mean(p_val_2 < 0.1))
)
kable(Proportion_report, format = "pandoc", padding = 2,
      caption = "Summary of Proportion Results")
| p_value | alpha_value | Proportion |
|---|---|---|
| p_value_1 | 0.01 | 0.0096 |
| p_value_1 | 0.05 | 0.0492 |
| p_value_1 | 0.1 | 0.1040 |
| p_value_2 | 0.01 | 0.0292 |
| p_value_2 | 0.05 | 0.0980 |
| p_value_2 | 0.1 | 0.1672 |
| *** |
For this exercise, we will use the corrosion data, which can be found in the faraway package. After loading the faraway package, use ?corrosion to learn about this dataset.
library(faraway)
(a) Fit a simple linear regression with loss as the response and Fe as the predictor. Plot a scatterplot and add the fitted line. Check the assumptions of this model.
# (a) SLR: weight loss regressed on iron content
corrosion_data <- faraway::corrosion
corrosion_slr_fit <- lm(loss ~ Fe, data = corrosion_data)
corrosion_slr_fit
##
## Call:
## lm(formula = loss ~ Fe, data = corrosion_data)
##
## Coefficients:
## (Intercept) Fe
## 130 -24
# (a) Scatterplot with the fitted SLR line (requested by the exercise but
# missing from the original code), followed by the usual diagnostics.
plot(loss ~ Fe, data = corrosion_data, col = "darkgrey", pch = 1, cex = 1,
     main = "Fe Vs Loss with Fitted SLR Line")
abline(corrosion_slr_fit, col = "darkorange", lwd = 2, lty = 1)
# Fitted vs residuals (constant-variance check)
plot(fitted(corrosion_slr_fit), resid(corrosion_slr_fit),
     col = "darkgrey", pch = 1, cex = 1,
     xlab = "Fitted", ylab = "Residuals", main = "Fitted Vs Residuals")
abline(h = 0, col = "darkorange", lwd = 2, lty = 1)
# Normal Q-Q plot (normality check)
qqnorm(resid(corrosion_slr_fit), col = "darkgrey", pch = 1, cex = 1,
       main = "Q-Q Plot")
qqline(resid(corrosion_slr_fit), col = "darkorange", lwd = 2, lty = 1)
(b) Fit higher order polynomial models of degree 2, 3, and 4. For each, plot a fitted versus residuals plot and comment on the constant variance assumption. Based on those plots, which of these three models do you think are acceptable? Use a statistical test(s) to compare the models you just chose. Based on the test, which is preferred? Check the normality assumption of this model. Identify any influential observations of this model.
# Higher-order polynomial fits using orthogonal polynomials via poly().
# NOTE(review): the exercise asks for degrees 2, 3, and 4, but these models
# fit degrees 2, 4, and 6 (and are named accordingly); the rendered output
# below reflects 2/4/6 -- confirm which degrees were intended.
corrosion_model_2 = lm(loss ~ poly(Fe,2), data = corrosion_data)
corrosion_model_4 = lm(loss ~ poly(Fe,4), data = corrosion_data)
corrosion_model_6 = lm(loss ~ poly(Fe,6), data = corrosion_data)
corrosion_model_2
##
## Call:
## lm(formula = loss ~ poly(Fe, 2), data = corrosion_data)
##
## Coefficients:
## (Intercept) poly(Fe, 2)1 poly(Fe, 2)2
## 108.82 -57.39 1.66
corrosion_model_4
##
## Call:
## lm(formula = loss ~ poly(Fe, 4), data = corrosion_data)
##
## Coefficients:
## (Intercept) poly(Fe, 4)1 poly(Fe, 4)2 poly(Fe, 4)3 poly(Fe, 4)4
## 108.82 -57.39 1.66 7.42 -3.17
corrosion_model_6
##
## Call:
## lm(formula = loss ~ poly(Fe, 6), data = corrosion_data)
##
## Coefficients:
## (Intercept) poly(Fe, 6)1 poly(Fe, 6)2 poly(Fe, 6)3 poly(Fe, 6)4
## 108.82 -57.39 1.66 7.42 -3.17
## poly(Fe, 6)5 poly(Fe, 6)6
## 4.53 1.64
# Fitted-vs-residuals plots for the degree 2, 4, and 6 polynomial fits
plot(fitted(corrosion_model_2), resid(corrosion_model_2),
     col = "darkgrey", pch = 1, cex = 1,
     xlab = "Fitted", ylab = "Residuals",
     main = "Fitted Vs Residuals - Poly 2")
abline(h = 0, col = "darkorange", lwd = 2, lty = 1)
plot(fitted(corrosion_model_4), resid(corrosion_model_4),
     col = "darkgrey", pch = 1, cex = 1,
     xlab = "Fitted", ylab = "Residuals",
     main = "Fitted Vs Residuals - Poly 4")
abline(h = 0, col = "darkorange", lwd = 2, lty = 1)
plot(fitted(corrosion_model_6), resid(corrosion_model_6),
     col = "darkgrey", pch = 1, cex = 1,
     xlab = "Fitted", ylab = "Residuals",
     main = "Fitted Vs Residuals - Poly 6")
abline(h = 0, col = "darkorange", lwd = 2, lty = 1)
***
The data set diamonds from the ggplot2 package contains prices and characteristics of 54,000 diamonds. For this exercise, use price as the response variable \(y\), and carat as the predictor \(x\). Use ?diamonds to learn more.
library(ggplot2)
(a) Fit a linear model with price as the response variable \(y\), and carat as the predictor \(x\). Return the summary information of this model.
# (a) Copy the diamonds data and fit the SLR price ~ carat
diamonds_data <- ggplot2::diamonds
diamonds_slr <- lm(price ~ carat, data = diamonds_data)
summary(diamonds_slr)
##
## Call:
## lm(formula = price ~ carat, data = diamonds_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18585 -805 -19 537 12732
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2256.4 13.1 -173 <2e-16 ***
## carat 7756.4 14.1 551 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1550 on 53938 degrees of freedom
## Multiple R-squared: 0.849, Adjusted R-squared: 0.849
## F-statistic: 3.04e+05 on 1 and 53938 DF, p-value: <2e-16
(b) Plot a scatterplot of price versus carat and add the line for the fitted model in part (a). Using a fitted versus residuals plot and/or a Q-Q plot, comment on the diagnostics.
plot(price~carat,data = diamonds_data,col = "darkgrey", pch = 1, cex = 1,main = "Carat Vs Price of the Diamonds")
(c) Seeing as the price stretches over several orders of magnitude, it seems reasonable to try a log transformation of the response. Fit a model with a logged response, plot a scatterplot of log-price versus carat and add the line for the fitted model, then use a fitted versus residuals plot and/or a Q-Q plot to comment on the diagnostics of the model.
# (c) Log-transformed response: fit, scatterplot with fitted line,
# then fitted-vs-residuals and Q-Q diagnostics
diamonds_slr_log <- lm(log(price) ~ carat, data = diamonds_data)
plot(log(price) ~ carat, data = diamonds_data,
     col = "darkgrey", pch = 1, cex = 1)
abline(diamonds_slr_log, lwd = 2, col = "darkorange", lty = 1)
par(mfrow = c(1, 2))
# Fitted versus residuals
plot(fitted(diamonds_slr_log), resid(diamonds_slr_log),
     col = "darkgrey", pch = 1, cex = 1,
     xlab = "Fitted", ylab = "Residuals",
     main = "Fitted Versus Residual Plot of Diamond model for Log(Price)",
     cex.main = 0.8)
abline(h = 0, col = "darkorange", lwd = 2, lty = 1)
# Normal Q-Q plot
qqnorm(resid(diamonds_slr_log), col = "darkgrey", pch = 1, cex = 1,
       main = "Q-Q Plot of the Diamond Model for the Log(Price)",
       cex.main = 0.8)
qqline(resid(diamonds_slr_log), col = "darkorange", lwd = 2, lty = 1)
qplot(price, data = diamonds, bins = 30)
(d) Try adding log transformation of the predictor. Fit a model with a logged response and logged predictor, plot a scatterplot of log-price versus log-carat and add the line for the fitted model, then use a fitted versus residuals plot and/or a Q-Q plot to comment on the diagnostics of the model.
# (d) Log-log model: fit, scatterplot with fitted line, then diagnostics
diamonds_slr_predict_log <- lm(log(price) ~ log(carat), data = diamonds_data)
plot(log(price) ~ log(carat), data = diamonds_data,
     col = "darkgrey", pch = 1, cex = 1, cex.main = 0.8)
abline(diamonds_slr_predict_log, lwd = 2, col = "darkorange", lty = 1)
par(mfrow = c(1, 2))
# Fitted versus residuals
plot(fitted(diamonds_slr_predict_log), resid(diamonds_slr_predict_log),
     col = "darkgrey", pch = 1, cex = 1,
     xlab = "Fitted", ylab = "Residuals",
     main = "Fitted Versus Residual Plot of Diamond model for Log(Price) and Log(carat)",
     cex.main = 0.8)
abline(h = 0, col = "darkorange", lwd = 2, lty = 1)
# Normal Q-Q plot
qqnorm(resid(diamonds_slr_predict_log), col = "darkgrey", pch = 1, cex = 1,
       main = "Q-Q Plot of the Diamond Model for the Log(Price) and Log(carat)",
       cex.main = 0.8)
qqline(resid(diamonds_slr_predict_log), col = "darkorange", lwd = 2, lty = 1)
(e) Use the model from part (d) to predict the price (in dollars) of a 3-carat diamond. Construct a 99% prediction interval for the price (in dollars).
exp(predict(diamonds_slr_predict_log,newdata = data.frame(carat = log(3)) , interval = "prediction", level = 0.99))
## fit lwr upr
## 1 5466 2779 10752